suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# NOTE(review): not referenced in the code visible here -- presumably the raw
# data volume; confirm it is still needed.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Project root. paste_wd(), figdir and tabledir are built from it by plain
# concatenation, so the trailing slash is required.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
# NOTE(review): setwd() changes global state for the rest of the session; none
# of the paths visible here are relative to the working directory, so it may
# be unnecessary -- confirm before removing.
setwd(wd)

# Output directories for figures and exported tables.
figdir <- paste0(wd, 'Figures/DRS_m3C_RNAs/Parameters/')
tabledir <- paste0(wd, 'Tables/Espresso/')

# Session-wide ggplot2 default: classic theme, base size 7, legend at the
# bottom.
theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

#' Build a path under the project root.
#'
#' Prepends the top-level variable `wd` to `path` by plain string
#' concatenation; no separator is inserted.
#'
#' @param path Character vector of paths relative to `wd`.
#' @return Character vector: `wd` immediately followed by `path`.
paste_wd <- function(path) {
  paste0(wd, path)
}


#' Read one space-delimited read-count file.
#'
#' @param path Path to a file whose two columns are read as `count` and
#'   `transcript_id` (column names are supplied here, so every line of the
#'   file is treated as data).
#' @return The parsed table with an extra `basename` column holding the file
#'   name of `path`, so rows can be traced back to their file after several
#'   files are combined.
read_readcounts_Espresso_unspliced <- function(path) {
  counts <- read_delim(
    path,
    delim = ' ',
    col_names = c('count', 'transcript_id')
  )
  mutate(counts, basename = basename(path))
}

#' Add cumulative `ymin`/`ymax` bounds for stacked segments.
#'
#' @param df Data frame with a numeric `percentage` column (presumably on a
#'   0-100 scale summing to 100 -- TODO confirm against callers). Rows are
#'   stacked in their current order.
#' @return `df` with two extra columns: `ymax`, the running total of
#'   `percentage / 100`, and `ymin`, the previous row's `ymax` (0 for the
#'   first row).
add_yrange <- function(df) {
  # Both bounds are derived inside one mutate(). Assigning `ymin` afterwards
  # with `$<-` fails on a zero-row base data.frame (a length-1 replacement for
  # zero rows) and ignores any grouping that mutate() applied to `ymax`.
  df |>
    mutate(
      ymax = cumsum(percentage / 100),
      ymin = dplyr::lag(ymax, default = 0)
    )
}

#' Draw a donut chart from a percentage breakdown.
#'
#' @param df Data frame with a `percentage` column (consumed by
#'   `add_yrange()`) and the column named by `.var`.
#' @param .var Unquoted column used for segment fill, outline colour and
#'   label text.
#' @param colours Colours passed to the manual fill and colour scales.
#'   Defaults to the previous fixed two-colour scheme; supply one colour per
#'   level of `.var` when there are more than two.
#' @return A ggplot object.
donutplot <- function(df, .var, colours = c('blue', 'red')) {
  df |>
    add_yrange() |>
    ggplot(aes(
      xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
      fill = {{ .var }}, colour = {{ .var }}
    )) +
    geom_rect() +
    # Mapping y to the angle bends the stacked rectangles into a ring.
    coord_polar(theta = 'y') +
    # Labels are placed at the angular midpoint of each segment, at x = 1,
    # i.e. inside the hole (the ring itself spans x = 2 to 4).
    ggrepel::geom_text_repel(
      aes(label = {{ .var }}, y = (ymin + ymax) / 2), x = 1
    ) +
    # Extending the x axis below xmin creates the hole in the middle.
    xlim(c(-1, 4)) +
    scale_fill_manual(values = colours) +
    scale_color_manual(values = colours) +
    theme_void()
}

Read data

# Per-transcript differential-expression table, loaded from the project
# tables directory.
espresso_deseq2 <- read_tsv(
  paste_wd('Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv')
)
## Rows: 36717 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): transcript_id, transcript_type, transcript_name, gene_id, gene_typ...
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_deseq2 
## # A tibble: 36,717 × 29
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 23 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Table of methylated k-mer positions (one row per position per transcript).
methylated_positions <- read_tsv(
  paste_wd('Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv')
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
methylated_positions
## # A tibble: 489 × 13
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCCAC           149      153
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCC           154      158
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCC           155      159
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCA           156      160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# One row per methylated transcript: keep only the identifier columns
# (`transcript_*` first, then `gene_*`) and collapse the per-position rows.
methylated_RNAs <-
  methylated_positions |>
  select(starts_with('transcript_') | starts_with('gene_')) |>
  distinct()
methylated_RNAs
## # A tibble: 71 × 3
##    transcript_id     gene_name gene_type     
##    <chr>             <chr>     <chr>         
##  1 ENST00000429711.7 RPL32     protein_coding
##  2 ENST00000647248.2 RPL35A    protein_coding
##  3 ENST00000389680.2 MT-RNR1   Mt_rRNA       
##  4 ENST00000361390.2 MT-ND1    protein_coding
##  5 ENST00000361453.3 MT-ND2    protein_coding
##  6 ENST00000387347.2 MT-RNR2   Mt_rRNA       
##  7 ENST00000361624.2 MT-CO1    protein_coding
##  8 ENST00000361739.1 MT-CO2    protein_coding
##  9 ENST00000361899.2 MT-ATP6   protein_coding
## 10 ENST00000361227.2 MT-ND3    protein_coding
## # ℹ 61 more rows
# Normalised count table (one column per sample), gzip-compressed TSV.
deseq2_normcount <- read_tsv(
  paste_wd('Tables/Espresso/espresso_DESeq2_normcount__2024-05-21.tsv.gz')
)
## Rows: 36717 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): transcript_id, transcript_type, transcript_name, gene_id, gene_type...
## dbl (9): siMETTL2A_I_N1, siMETTL2A_I_N2, siMETTL2A_I_N3, siMETTL2A_G_N1, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
deseq2_normcount
## # A tibble: 36,717 × 16
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 10 more variables: siMETTL2A_I_N1 <dbl>, siMETTL2A_I_N2 <dbl>,
## #   siMETTL2A_I_N3 <dbl>, siMETTL2A_G_N1 <dbl>, siMETTL2A_G_N2 <dbl>,
## #   siMETTL2A_G_N3 <dbl>, Cont_D_N1 <dbl>, Cont_D_N2 <dbl>, Cont_D_N3 <dbl>,
## #   seqname <chr>
# Long-format quantification table with `count` and `cpm` per transcript and
# sample.
cpms <- read_tsv(
  paste_wd('Tables/DRS_quantification/espresso_quantification_cpm_2024-04-19.tsv.gz')
)
## Rows: 330453 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): transcript_id, transcript_name, gene_id, type, si, seqname, source...
## dbl  (6): rep, count, total_reads, cpm, start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cpms
## # A tibble: 330,453 × 20
##    transcript_id     transcript_name gene_id type  si      rep count total_reads
##    <chr>             <chr>           <chr>   <chr> <chr> <dbl> <dbl>       <dbl>
##  1 ENST00000498442.1 CRBN-212        ENSG00… siME… I         1  0        3552783
##  2 ENST00000498442.1 CRBN-212        ENSG00… siME… I         2  1         997879
##  3 ENST00000498442.1 CRBN-212        ENSG00… siME… I         3  0        2778705
##  4 ENST00000498442.1 CRBN-212        ENSG00… siME… G         1  0        3497396
##  5 ENST00000498442.1 CRBN-212        ENSG00… siME… G         2  0        3810844
##  6 ENST00000498442.1 CRBN-212        ENSG00… siME… G         3  0        3668094
##  7 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         1  1        2701773
##  8 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         2  1        3406597
##  9 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         3  0        3653792
## 10 ENST00000459840.5 CRBN-205        ENSG00… siME… I         1  1.08     3552783
## # ℹ 330,443 more rows
## # ℹ 12 more variables: cpm <dbl>, seqname <chr>, source <chr>, feature <chr>,
## #   start <dbl>, end <dbl>, score <chr>, strand <chr>, frame <chr>,
## #   gene_type <chr>, gene_name <chr>, transcript_type <chr>
# Read every `*_count.txt` file in the directory and stack the results into
# one long table (`basename` identifies the source file of each row).
# bind_rows() combines the whole list in a single call; folding pairwise with
# reduce(bind_rows) re-copies the accumulated rows at every step and raises an
# error when the glob matches no file.
readcount_espressso_unspliced <- 
  fs::dir_ls(
    '/Volumes/Mitsu_NGS_2/METTL2A/Alignment/Minimap2/Espresso_unspliced/', 
    glob = '*_count.txt'
  ) |> 
  map(read_readcounts_Espresso_unspliced) |> 
  bind_rows()
## Rows: 26582 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 20396 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 24993 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25198 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25097 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 24350 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25403 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25027 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 25832 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): transcript_id
## dbl (1): count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
readcount_espressso_unspliced
## # A tibble: 222,878 × 3
##    count transcript_id      basename                                  
##    <dbl> <chr>              <chr>                                     
##  1   266 ENST00000000233.10 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  2    29 ENST00000000412.8  221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  3    34 ENST00000000442.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  4   114 ENST00000001008.6  221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  5    12 ENST00000002125.9  221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  6   120 ENST00000002165.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  7     2 ENST00000002501.11 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  8    19 ENST00000002596.6  221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
##  9   178 ENST00000003100.13 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## 10     2 ENST00000003583.12 221117_DrTaniue_7_siMETTL2A_I_N1_count.txt
## # ℹ 222,868 more rows
# Transcript sequences with their lengths, gzip-compressed TSV.
espresso_AsPC1_seqs <- read_tsv(
  paste_wd('Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz')
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_AsPC1_seqs
## # A tibble: 36,717 × 3
##    transcript_id      transcript_seq                           transcript_length
##    <chr>              <chr>                                                <dbl>
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…               987
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA…              2252
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG…               854
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC…              6597
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC…              5500
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT…              4528
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC…              2038
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA…              2187
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG…              2203
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG…               723
## # ℹ 36,707 more rows

Calculate average CPM and read counts

# deseq2_normcount_mean <- 
#   deseq2_normcount |> 
#   select(transcript_id, starts_with('siMETTL2A_'), starts_with('Cont_')) |> 
#   pivot_longer(cols = -transcript_id) |> 
#   group_by(transcript_id) |> 
#   reframe(mean_normcount = mean(value, na.rm = TRUE))

# Per-transcript summary over all rows of `cpms`: mean CPM and summed counts,
# with missing values ignored.
mean_CPMs <-
  cpms |>
  group_by(transcript_id) |>
  summarise(
    mean_CPMs    = mean(cpm, na.rm = TRUE),
    total_counts = sum(count, na.rm = TRUE)
  )
mean_CPMs
## # A tibble: 36,717 × 3
##    transcript_id      mean_CPMs total_counts
##    <chr>                  <dbl>        <dbl>
##  1 ENST00000000233.10   46.8          1222. 
##  2 ENST00000000412.8     7.60          218  
##  3 ENST00000000442.11    7.30          188. 
##  4 ENST00000001008.6    16.9           482  
##  5 ENST00000002125.9     2.12           48.8
##  6 ENST00000002165.11   21.6           576. 
##  7 ENST00000002501.11    0.0703          2  
##  8 ENST00000002596.6     1.93           52  
##  9 ENST00000003100.13   20.5           584. 
## 10 ENST00000003583.12    0.164           4.6
## # ℹ 36,707 more rows
# Number of count files that were read. Used as the denominator of the
# per-transcript mean so it follows the data instead of a hard-coded 9.
n_count_files <- n_distinct(readcount_espressso_unspliced$basename)

# Per-transcript totals across all count files.
sum_readcount_espresso_unspliced <- 
  readcount_espressso_unspliced |> 
  group_by(transcript_id) |> 
  reframe(
    total_read = sum(count, na.rm = TRUE),
    # NOTE(review): a transcript has no row for a file in which it was not
    # counted, so this is the minimum over the files where it appears, not
    # over all files -- confirm this is the intended definition.
    min_read   = min(count, na.rm = TRUE)
  ) |> 
  # Dividing by the file count (rather than the number of rows per
  # transcript) treats files without the transcript as zero reads.
  mutate(mean_read = total_read / n_count_files)
sum_readcount_espresso_unspliced
## # A tibble: 33,173 × 4
##    transcript_id      total_read min_read mean_read
##    <chr>                   <dbl>    <dbl>     <dbl>
##  1 ENST00000000233.10       1359       85   151    
##  2 ENST00000000412.8         519        8    57.7  
##  3 ENST00000000442.11        227        8    25.2  
##  4 ENST00000001008.6         809       23    89.9  
##  5 ENST00000002125.9          83        4     9.22 
##  6 ENST00000002165.11        589       33    65.4  
##  7 ENST00000002501.11          7        1     0.778
##  8 ENST00000002596.6         101        1    11.2  
##  9 ENST00000003100.13       1417       40   157.   
## 10 ENST00000003583.12         23        1     2.56 
## # ℹ 33,163 more rows

Calculate GC% …

# Sequence composition per transcript: raw pattern counts, then the same
# counts divided by `transcript_length`.
# NOTE(review): str_count() counts non-overlapping matches, so a run such as
# "CCC" contributes a single 'CC' -- confirm that is the intended definition.
espresso_AsPC1_RNAinfo <-
  espresso_AsPC1_seqs |>
  mutate(
    num_GC     = str_count(transcript_seq, 'G|C'),
    num_C      = str_count(transcript_seq, 'C'),
    num_CC     = str_count(transcript_seq, 'CC'),
    GC_content = num_GC / transcript_length,
    C_content  = num_C / transcript_length,
    CC_content = num_CC / transcript_length
  )
espresso_AsPC1_RNAinfo
## # A tibble: 36,717 × 9
##    transcript_id transcript_seq transcript_length num_GC num_C num_CC GC_content
##    <chr>         <chr>                      <dbl>  <int> <int>  <int>      <dbl>
##  1 ENST00000339… AGCCCGGAAGTGC…               987    406   182     35      0.411
##  2 ENST00000251… AGCCCGGAAGTGC…              2252    838   353     60      0.372
##  3 ENST00000420… CAGCGGGGCCGGT…               854    459   215     53      0.537
##  4 ENST00000698… GATGTATGATGAG…              6597   2365  1105    179      0.358
##  5 ENST00000698… CATGACTAGTTTT…              5500   1892   893    143      0.344
##  6 ENST00000488… AGGAACTTCATCA…              4528   1621   736    126      0.358
##  7 ENST00000424… GAGATCAGCAGGA…              2038    763   368     73      0.374
##  8 ENST00000231… AGACATGGCCGGC…              2187    835   400     80      0.382
##  9 ENST00000432… GCCTCCTTTGCGG…              2203    844   405     82      0.383
## 10 ENST00000459… ATGGAGGCATTTA…               723    282   132     26      0.390
## # ℹ 36,707 more rows
## # ℹ 2 more variables: C_content <dbl>, CC_content <dbl>
# Label each transcript as 'm3C' (present in `methylated_RNAs`) or 'other'.
# The join keys are spelled out -- they are the columns the two tables share --
# so a column added to either table later cannot silently change the match.
# The relationship check makes the join fail loudly if `methylated_RNAs` ever
# holds duplicated keys that would multiply rows.
espresso_deseq2_m3Cinfo <- 
  espresso_deseq2 |> 
  left_join(
    methylated_RNAs |> mutate(m3C = 'm3C'),
    by = join_by(transcript_id, gene_type, gene_name),
    relationship = 'many-to-one'
  ) |> 
  replace_na(list(m3C = 'other'))
## Joining with `by = join_by(transcript_id, gene_type, gene_name)`
espresso_deseq2_m3Cinfo |> 
  export_tsv(outdir = tabledir, compression = 'gz')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/Espresso/espresso_deseq2_m3Cinfo_2024-07-31.tsv.gz
## # A tibble: 36,717 × 30
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 24 more variables: siMETTL2A_baseMean <dbl>,
## #   siMETTL2A_log2FoldChange <dbl>, siMETTL2A_lfcSE <dbl>,
## #   siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>, siMETTL2A_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, siMETTL2A_I_pvalue <dbl>,
## #   siMETTL2A_I_padj <dbl>, siMETTL2A_G_baseMean <dbl>, …
# Per-transcript annotation table: identifiers and m3C label, joined with the
# read-count summary and the sequence-composition table. Both joins name
# `transcript_id` explicitly instead of relying on whichever columns the
# tables happen to share.
m3CRNA_transcriptinfo <- 
  espresso_deseq2_m3Cinfo |> 
  select(
    starts_with('transcript_'), starts_with('gene_'), seqname, genetype2, 
    m3C, common_DETs
  ) |> 
  left_join(sum_readcount_espresso_unspliced, by = join_by(transcript_id)) |> 
  left_join(espresso_AsPC1_RNAinfo, by = join_by(transcript_id))
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
m3CRNA_transcriptinfo
## # A tibble: 36,717 × 21
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 15 more variables: seqname <chr>, genetype2 <chr>, m3C <chr>,
## #   common_DETs <chr>, total_read <dbl>, min_read <dbl>, mean_read <dbl>,
## #   transcript_seq <chr>, transcript_length <dbl>, num_GC <int>, num_C <int>,
## #   num_CC <int>, GC_content <dbl>, C_content <dbl>, CC_content <dbl>
m3CRNA_transcriptinfo |> 
  export_tsv(outdir = tabledir, compression = 'gz')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/Espresso/m3CRNA_transcriptinfo_2024-07-31.tsv.gz
## # A tibble: 36,717 × 21
##    transcript_id     transcript_type transcript_name gene_id gene_type gene_name
##    <chr>             <chr>           <chr>           <chr>   <chr>     <chr>    
##  1 ENST00000498442.1 retained_intron CRBN-212        ENSG00… protein_… CRBN     
##  2 ENST00000459840.5 retained_intron CRBN-205        ENSG00… protein_… CRBN     
##  3 ENST00000231948.9 protein_coding  CRBN-201        ENSG00… protein_… CRBN     
##  4 ENST00000432408.6 protein_coding  CRBN-203        ENSG00… protein_… CRBN     
##  5 ENST00000339437.… protein_coding  TRNT1-203       ENSG00… protein_… TRNT1    
##  6 ENST00000488263.5 retained_intron CRBN-209        ENSG00… protein_… CRBN     
##  7 ENST00000420393.5 protein_coding  TRNT1-207       ENSG00… protein_… TRNT1    
##  8 ENST00000698415.1 retained_intron TRNT1-230       ENSG00… protein_… TRNT1    
##  9 ENST00000450014.1 protein_coding  CRBN-204        ENSG00… protein_… CRBN     
## 10 ENST00000698416.1 retained_intron TRNT1-231       ENSG00… protein_… TRNT1    
## # ℹ 36,707 more rows
## # ℹ 15 more variables: seqname <chr>, genetype2 <chr>, m3C <chr>,
## #   common_DETs <chr>, total_read <dbl>, min_read <dbl>, mean_read <dbl>,
## #   transcript_seq <chr>, transcript_length <dbl>, num_GC <int>, num_C <int>,
## #   num_CC <int>, GC_content <dbl>, C_content <dbl>, CC_content <dbl>

Plots

Mean read count

m3CRNA_transcriptinfo |> 
  rstatix::wilcox_test(mean_read ~ m3C)
## # A tibble: 1 × 7
##   .y.       group1 group2    n1    n2 statistic        p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>    <dbl>
## 1 mean_read m3C    other     71 33102   2348049 5.31e-48
# Empirical CDF of `mean_read` on a log10 x axis, one curve per `m3C` label,
# with reference lines at 200 and 300.
# Colours are keyed by the `m3C` value so the mapping cannot flip if the set
# or order of labels changes. `linewidth` is the current ggplot2 name for
# line thickness (`lwd` is only an alias for it).
ecdf_meanread_m3C <- 
  m3CRNA_transcriptinfo |> 
  ggplot(aes(x = mean_read, colour = m3C)) +
  stat_ecdf(linewidth = 1.1) +
  geom_vline(xintercept = c(200, 300)) +
  scale_x_log10() +
  scale_color_manual(values = c(m3C = '#00998C', other = '#808080'))
ecdf_meanread_m3C |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 6, height = 4, fontsize = 7
  )
## Warning: Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

m3CRNA_transcriptinfo |> 
  filter(m3C == 'm3C') |> 
  arrange(mean_read) |> 
  select(transcript_name, mean_read)
## # A tibble: 71 × 2
##    transcript_name mean_read
##    <chr>               <dbl>
##  1 RPLP0-226            283.
##  2 CEACAM6-201          304.
##  3 ATP5F1A-202          328.
##  4 PRELID1-201          363.
##  5 H3-3B-201            365 
##  6 MDK-203              471.
##  7 RPLP0-219            531.
##  8 ATP5MJ-201           571.
##  9 TOMM7-201            703 
## 10 SH3BGRL3-201         783.
## # ℹ 61 more rows
# Number of transcripts per `m3C` label among those with mean_read > 283.
m3CRNA_transcriptinfo |>
  filter(mean_read > 283) |>
  count(m3C)
## # A tibble: 2 × 2
##   m3C       n
##   <chr> <int>
## 1 m3C      71
## 2 other   296
# Transcripts not labelled 'm3C' with mean_read > 283, highest mean first.
m3CRNA_transcriptinfo |>
  filter(mean_read > 283, m3C != 'm3C') |>
  arrange(desc(mean_read)) |>
  select(transcript_name, mean_read)
## # A tibble: 296 × 2
##    transcript_name mean_read
##    <chr>               <dbl>
##  1 RPS18-236           3026.
##  2 RPL31-201           2547 
##  3 TMSB4X-204          2150.
##  4 RPS15A-203          2141.
##  5 RPS27A-201          1727.
##  6 MT-ND5-201          1688.
##  7 RPL23A-204          1676.
##  8 S100A11-201         1619.
##  9 RPS10-209           1529.
## 10 ENST00000423610     1521.
## # ℹ 286 more rows
# Among transcripts with mean_read > 283: counts per `m3C` label, split by
# whether mean_read also exceeds 1000.
m3CRNA_transcriptinfo |>
  filter(mean_read > 283) |>
  #filter(m3C != 'm3C') |>
  count(m3C, mean_read > 1000)
## # A tibble: 4 × 3
##   m3C   `mean_read > 1000`     n
##   <chr> <lgl>              <int>
## 1 m3C   FALSE                 14
## 2 m3C   TRUE                  57
## 3 other FALSE                252
## 4 other TRUE                  44
# Empirical CDF of `mean_read` (log10 x axis) for transcripts with
# mean_read > 100, one curve per `m3C` label.
# Colours are keyed by the `m3C` value so the mapping cannot flip if the set
# or order of labels changes. `linewidth` is the current ggplot2 name for
# line thickness (`lwd` is only an alias for it).
m3CRNA_transcriptinfo |> 
  filter(mean_read > 100) |> 
  ggplot(aes(x = mean_read, colour = m3C)) +
  stat_ecdf(linewidth = 1.1) +
  geom_vline(xintercept = c(200, 300)) +
  scale_x_log10() +
  scale_color_manual(values = c(m3C = '#00998C', other = '#808080'))

Minimum read count

m3CRNA_transcriptinfo |> 
  rstatix::wilcox_test(min_read ~ m3C)
## # A tibble: 1 × 7
##   .y.      group1 group2    n1    n2 statistic       p
## * <chr>    <chr>  <chr>  <int> <int>     <dbl>   <dbl>
## 1 min_read m3C    other     71 33102  2348102. 2.8e-59
# Empirical CDF of `min_read` on a log10 x axis, one curve per `m3C` label,
# with a reference line at 100.
# Colours are keyed by the `m3C` value so the mapping cannot flip if the set
# or order of labels changes. `linewidth` is the current ggplot2 name for
# line thickness (`lwd` is only an alias for it).
ecdf_minread_m3C <- 
  m3CRNA_transcriptinfo |> 
  ggplot(aes(x = min_read, colour = m3C)) +
  stat_ecdf(linewidth = 1.1) +
  geom_vline(xintercept = c(100)) +
  scale_x_log10() +
  scale_color_manual(values = c(m3C = '#00998C', other = '#808080'))
ecdf_minread_m3C |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 6, height = 4, fontsize = 7
  )
## Warning: Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Removed 3544 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

# Number of transcripts per `m3C` label among those with min_read > 100.
m3CRNA_transcriptinfo |>
  filter(min_read > 100) |>
  count(m3C)
## # A tibble: 2 × 2
##   m3C       n
##   <chr> <int>
## 1 m3C      71
## 2 other   330
# Number of rows in `methylated_positions` per transcript.
num_m3Csites <-
  methylated_positions |>
  count(transcript_id, name = 'num_m3Csites')
num_m3Csites
## # A tibble: 71 × 2
##    transcript_id      num_m3Csites
##    <chr>                     <int>
##  1 ENST00000009589.8             1
##  2 ENST00000199764.7             1
##  3 ENST00000202773.14            2
##  4 ENST00000215754.8             4
##  5 ENST00000229239.10            2
##  6 ENST00000230050.4             4
##  7 ENST00000233143.6            15
##  8 ENST00000234875.9             2
##  9 ENST00000243997.8             3
## 10 ENST00000254810.8             1
## # ℹ 61 more rows
# Hex-binned plot of min_read (log10 x axis) against the number of m3C sites
# divided by the number of C in the transcript, for transcripts with
# min_read > 10. Transcripts absent from `num_m3Csites` get 0 sites.
# The join key is named explicitly so it cannot change if the two tables
# later share another column.
# NOTE(review): the filter uses 10 while the reference line is drawn at 100 --
# confirm both thresholds are intended.
correlation_minread_m3Cfraction <- 
  m3CRNA_transcriptinfo |> 
  filter(min_read > 10) |> 
  left_join(num_m3Csites, by = join_by(transcript_id)) |> 
  replace_na(list(num_m3Csites = 0)) |> 
  ggplot(aes(x = min_read , y = num_m3Csites / num_C)) +
  geom_hex(bins = 50) +
  scale_x_log10() +
  scale_fill_viridis_c(trans = 'log10') +
  geom_vline(xintercept = c(100), color = 'gray20') 
## Joining with `by = join_by(transcript_id)`
correlation_minread_m3Cfraction |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 6, height = 6, fontsize = 7
  )

# Rows of the joined sampcomp results table for transcripts with
# min_read > 100. Rows without an `intensity_up` value are dropped, which also
# removes transcripts that have no match in the results table.
nanocompore_wellexpressed <-
  m3CRNA_transcriptinfo |>
  filter(min_read > 100) |>
  distinct(transcript_id) |>
  left_join(
    read_tsv(
      paste_wd('Tables/DRS_m3C_sites/sampcomp_results_joined_2024-04-24.tsv.gz')
    ),
    by = join_by(transcript_id)
  ) |>
  filter(!is.na(intensity_up))
## Rows: 5884004 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (34): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (33): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nanocompore_wellexpressed 
## # A tibble: 397,223 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204             30 TCCTC                    NA
##  2 ENST00000429711.7 RPL32-204             31 CCTCG                     1
##  3 ENST00000429711.7 RPL32-204             32 CTCGG                     1
##  4 ENST00000429711.7 RPL32-204             33 TCGGC                     1
##  5 ENST00000429711.7 RPL32-204             34 CGGCG                     1
##  6 ENST00000429711.7 RPL32-204             35 GGCGC                     1
##  7 ENST00000429711.7 RPL32-204             36 GCGCT                     1
##  8 ENST00000429711.7 RPL32-204             37 CGCTG                     1
##  9 ENST00000429711.7 RPL32-204             38 GCTGC                     1
## 10 ENST00000429711.7 RPL32-204             39 CTGCC                     1
## # ℹ 397,213 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Count rows for every (intensity_up, middle_isC) combination and add each
# combination's share of all rows, in percent.
nanocompore_wellexpressed |> 
  count(intensity_up, middle_isC) |> 
  mutate(percent = 100 * n / sum(n))
## # A tibble: 8 × 4
##   intensity_up middle_isC      n percent
##   <chr>        <chr>       <int>   <dbl>
## 1 common       C             489  0.123 
## 2 common       others        113  0.0284
## 3 only G       C            1422  0.358 
## 4 only G       others        802  0.202 
## 5 only I       C             265  0.0667
## 6 only I       others        168  0.0423
## 7 others       C           93755 23.6   
## 8 others       others     300209 75.6

% of RNAs with intensity up sites or m3C sites

Calculate

# Per transcript, count the rows whose intensity_up is 'common'
# (num_intensityup) and the subset of those whose middle_isC is 'C' (num_m3C),
# then add 'yes'/'no' flags telling whether each count is non-zero.
# group_by() is used (rather than `.by`) so rows stay sorted by transcript_id.
num_m3C_intensityup_wellexpressed <- 
  nanocompore_wellexpressed |> 
  mutate(is_common = intensity_up == 'common') |> 
  group_by(transcript_id) |> 
  summarise(
    num_intensityup = sum(is_common, na.rm = TRUE),
    num_m3C = sum(is_common & middle_isC == 'C', na.rm = TRUE),
    .groups = 'drop'
  ) |> 
  mutate(
    have_intensityup = if_else(num_intensityup > 0, 'yes', 'no'),
    have_m3C         = if_else(num_m3C > 0, 'yes', 'no')
  )
num_m3C_intensityup_wellexpressed
## # A tibble: 400 × 5
##    transcript_id      num_intensityup num_m3C have_intensityup have_m3C
##    <chr>                        <int>   <int> <chr>            <chr>   
##  1 ENST00000007516.8                0       0 no               no      
##  2 ENST00000009180.10               0       0 no               no      
##  3 ENST00000009589.8                1       1 yes              yes     
##  4 ENST00000027335.8                0       0 no               no      
##  5 ENST00000175091.5                0       0 no               no      
##  6 ENST00000184266.3                0       0 no               no      
##  7 ENST00000196551.8                0       0 no               no      
##  8 ENST00000199764.7                1       1 yes              yes     
##  9 ENST00000202773.14               3       2 yes              yes     
## 10 ENST00000215754.8                4       4 yes              yes     
## # ℹ 390 more rows
# Number and percentage of transcripts with ('yes') and without ('no') at
# least one 'common' intensity-up row.
percent_have_intensityup_wellexpressed <- 
  num_m3C_intensityup_wellexpressed |> 
  count(have_intensityup) |> 
  mutate(percentage = 100 * n / sum(n))
percent_have_intensityup_wellexpressed
## # A tibble: 2 × 3
##   have_intensityup     n percentage
##   <chr>            <int>      <dbl>
## 1 no                 318       79.5
## 2 yes                 82       20.5
# Number and percentage of transcripts with ('yes') and without ('no') a
# non-zero num_m3C count.
percent_have_m3C_wellexpressed <- 
  num_m3C_intensityup_wellexpressed |> 
  count(have_m3C) |> 
  mutate(percentage = 100 * n / sum(n))
percent_have_m3C_wellexpressed
## # A tibble: 2 × 3
##   have_m3C     n percentage
##   <chr>    <int>      <dbl>
## 1 no         329       82.2
## 2 yes         71       17.8

Donut plot

# Donut chart of the yes/no split of transcripts by have_intensityup, saved
# into figdir.
# NOTE(review): plot object names are kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from the object name -- confirm before renaming.
donutplot_have_intensityup <- donutplot(
  percent_have_intensityup_wellexpressed, have_intensityup
)
ggsave_multiple_formats(
  donutplot_have_intensityup,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

# Same chart for the yes/no split by have_m3C.
donutplot_have_m3C <- donutplot(
  percent_have_m3C_wellexpressed, have_m3C
)
ggsave_multiple_formats(
  donutplot_have_m3C,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

% of sites with intensity up

# Preview the rows that have a non-missing intensity_up value.
# NOTE(review): the printed row count (397,223) equals that of the table
# itself, so this filter appears to remove nothing here -- confirm.
filter(nanocompore_wellexpressed, !is.na(intensity_up))
## # A tibble: 397,223 × 67
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204             30 TCCTC                    NA
##  2 ENST00000429711.7 RPL32-204             31 CCTCG                     1
##  3 ENST00000429711.7 RPL32-204             32 CTCGG                     1
##  4 ENST00000429711.7 RPL32-204             33 TCGGC                     1
##  5 ENST00000429711.7 RPL32-204             34 CGGCG                     1
##  6 ENST00000429711.7 RPL32-204             35 GGCGC                     1
##  7 ENST00000429711.7 RPL32-204             36 GCGCT                     1
##  8 ENST00000429711.7 RPL32-204             37 CGCTG                     1
##  9 ENST00000429711.7 RPL32-204             38 GCTGC                     1
## 10 ENST00000429711.7 RPL32-204             39 CTGCC                     1
## # ℹ 397,213 more rows
## # ℹ 62 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <chr>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Number and percentage of rows whose intensity_up is 'common' (TRUE) versus
# any other value (FALSE).
percent_intensityup_sites_wellexpressed <- 
  nanocompore_wellexpressed |> 
  count(common_intensity_up = intensity_up == 'common') |> 
  mutate(percentage = 100 * n / sum(n))
percent_intensityup_sites_wellexpressed
## # A tibble: 2 × 3
##   common_intensity_up      n percentage
##   <lgl>                <int>      <dbl>
## 1 FALSE               396621     99.8  
## 2 TRUE                   602      0.152
# Donut chart of the TRUE/FALSE split by common_intensity_up, saved into
# figdir.
# NOTE(review): the plot object name is kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from it -- confirm before renaming.
donutplot_intensityup_sites <- donutplot(
  percent_intensityup_sites_wellexpressed, common_intensity_up
)
ggsave_multiple_formats(
  donutplot_intensityup_sites,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

Length

# Wilcoxon test comparing transcript_length between the m3C groups.
rstatix::wilcox_test(m3CRNA_transcriptinfo, transcript_length ~ m3C)
## # A tibble: 1 × 7
##   .y.               group1 group2    n1    n2 statistic       p
## * <chr>             <chr>  <chr>  <int> <int>     <dbl>   <dbl>
## 1 transcript_length m3C    other     71 36646    975083 0.00026
# Empirical CDF of transcript_length on a log10 x-axis, one curve per m3C
# group, saved into figdir.
# NOTE(review): the plot object name is kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from it -- confirm before renaming.
ecdf_RNAlength_m3C <- 
  ggplot(m3CRNA_transcriptinfo, aes(x = transcript_length, colour = m3C)) +
  stat_ecdf(lwd = 1.1) +
  scale_x_log10() +
  scale_colour_manual(values = c('#00998C', '#808080'))
ggsave_multiple_formats(
  ecdf_RNAlength_m3C,
  outdir = figdir, width = 6, height = 4, fontsize = 7
)

# m3CRNA_transcriptinfo |> 
#   filter(genetype2 == 'mRNA') |> 
#   rstatix::wilcox_test(mean_normcount ~ m3C)


# Empirical CDF of transcript_length on a log10 x-axis, restricted to rows
# whose genetype2 is 'mRNA', one curve per m3C group; saved into figdir.
# NOTE(review): the plot object name is kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from it -- confirm before renaming.
ecdf_mRNAlength_m3C <- 
  ggplot(
    filter(m3CRNA_transcriptinfo, genetype2 == 'mRNA'),
    aes(x = transcript_length, colour = m3C)
  ) +
  stat_ecdf(lwd = 1.1) +
  scale_x_log10() +
  scale_colour_manual(values = c('#00998C', '#808080'))
ggsave_multiple_formats(
  ecdf_mRNAlength_m3C,
  outdir = figdir, width = 6, height = 4, fontsize = 7
)

C content

# Wilcoxon test comparing C_content between the m3C groups.
rstatix::wilcox_test(m3CRNA_transcriptinfo, C_content ~ m3C)
## # A tibble: 1 × 7
##   .y.       group1 group2    n1    n2 statistic      p
## * <chr>     <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1 C_content m3C    other     71 36646  1524682. 0.0122
# Empirical CDF of C_content, one curve per m3C group, saved into figdir.
# NOTE(review): the plot object name is kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from it -- confirm before renaming.
ecdf_Ccontent_m3C <- 
  ggplot(m3CRNA_transcriptinfo, aes(x = C_content, colour = m3C)) +
  stat_ecdf(lwd = 1.1) +
  scale_colour_manual(values = c('#00998C', '#808080'))
ggsave_multiple_formats(
  ecdf_Ccontent_m3C,
  outdir = figdir, width = 6, height = 4, fontsize = 7
)

CC content

# Wilcoxon test comparing CC_content between the m3C groups.
rstatix::wilcox_test(m3CRNA_transcriptinfo, CC_content ~ m3C)
## # A tibble: 1 × 7
##   .y.        group1 group2    n1    n2 statistic      p
## * <chr>      <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1 CC_content m3C    other     71 36646   1521939 0.0133
# Empirical CDF of CC_content, one curve per m3C group, saved into figdir.
# NOTE(review): the plot object name is kept unchanged because
# ggsave_multiple_formats() (from myUtilities) may derive the output file name
# from it -- confirm before renaming.
ecdf_CCcontent_m3C <- 
  ggplot(m3CRNA_transcriptinfo, aes(x = CC_content, colour = m3C)) +
  stat_ecdf(lwd = 1.1) +
  scale_colour_manual(values = c('#00998C', '#808080'))
ggsave_multiple_formats(
  ecdf_CCcontent_m3C,
  outdir = figdir, width = 6, height = 4, fontsize = 7
)